Overview

Aggregate studies

print(studies)
##  [1] "BIDMC-FMT"                           
##  [2] "CS-PRISM"                            
##  [3] "Herfarth_CCFA_Microbiome_3B_combined"
##  [4] "HMP2"                                
##  [5] "Jansson_Lamendella_Crohns"           
##  [6] "LSS-PRISM"                           
##  [7] "MucosalIBD"                          
##  [8] "Pouchitis"                           
##  [9] "PROTECT"                             
## [10] "RISK"
l_otus <- l_species <- l_genera <- list()
for(study in studies) {
  load(paste0("data/phyloseq/",
              study,
              "/otus.RData"))
  load(paste0("data/phyloseq/",
              study,
              "/species.RData"))
  load(paste0("data/phyloseq/",
              study,
              "/genera.RData"))
  l_otus[[study]] <- phylo
  l_species[[study]] <- phylo_species
  l_genera[[study]] <- phylo_genera
}
phylo_otus <- combine_phyloseq(l_otus)
phylo_species <- combine_phyloseq(l_species)
phylo_genera <- combine_phyloseq(l_genera)

Grouping otus into genera seems more appropriate than species

tb_long <- l_otus %>% 
  purrr::map_dfr(
    function(phylo_otus) {
      tb_ra_otus <- phylo_otus %>% 
        transform_sample_counts(tss) %>% 
        phyloseq_to_tb()
      tb_otu_taxonamy <- phylo_otus %>% 
        tax_table2() %>% 
        tibble::as_tibble(rownames = "feature") %>% 
        dplyr::mutate(classification = dplyr::case_when(
          Rank7 != "s__" ~ "fully classified",
          Rank1 == "k__" ~ "unclassified at kingdom",
          Rank2 == "p__" ~ "unclassified at phylumn",
          Rank3 == "c__" ~ "unclassified at class",
          Rank4 == "o__" ~ "unclassified at order",
          Rank5 == "f__" ~ "unclassified at family",
          Rank6 == "g__" ~ "unclassified at genus",
          Rank7 == "s__" ~ "unclassified at species"
        ) %>% 
          factor(levels= c("fully classified",
                           "unclassified at species",
                           "unclassified at genus",
                           "unclassified at family",
                           "unclassified at order",
                           "unclassified at class",
                           "unclassified at phylumn",
                           "unclassified at kingdom")))
      
      tb_ra_otus %>% 
        dplyr::left_join(tb_otu_taxonamy,
                         by = c("feature", "Rank1", "Rank2", "Rank3", 
                                "Rank4", "Rank5", "Rank6", "Rank7"))
    }
  )
p <- tb_long %>% 
  dplyr::group_by(dataset_name, feature, classification) %>% dplyr::filter(!duplicated(feature)) %>% dplyr::summarise(n = n()) %>%
  ggplot(aes(x = dataset_name, y = n, fill = classification)) +
  geom_bar(stat = "identity") +
  rotate_xaxis(30) +
  ggtitle("OTUs are mostly classified at order, family, genus, and species level")
print(p) %>% 
  ggsave(paste0(dir_output, "fig_OTUClassification.pdf"),
         .,
         width = 8,
         height = 5)

p <- tb_long %>% 
  dplyr::group_by(dataset_name, rownames, classification) %>% dplyr::summarise(abundance = sum(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::group_by(dataset_name, rownames) %>% dplyr::mutate(abundance_fullyClassified = 
                                                              sum(abundance[classification %in% c("fully classified",
                                                                                                  "unclassified at species")])) %>% dplyr::ungroup() %>% 
  dplyr::arrange(dataset_name, abundance_fullyClassified) %>% dplyr::mutate(rownames = factor(rownames, levels = unique(rownames))) %>% 
  ggplot(aes(x = rownames, y = abundance, fill = classification)) +
  geom_bar(stat = "identity") +
  facet_grid(.~dataset_name, scales = "free_x", space = "free_x") +
  no_label_xaxis() +
  ggtitle("OTUs are most abundantly classified at family, genus, and species level.")
print(p) %>% 
  ggsave(paste0(dir_output, "fig_speciesClassificationAbd.pdf"),
         .,
         width = 20,
         height = 6)

tb_long_speciesUnclassified <- tb_long %>% 
  dplyr::filter(classification == "unclassified at species") %>% 
  dplyr::group_by(dataset_name, rownames, Rank6, Rank7) %>% dplyr::summarise(abundance = sum(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::group_by(Rank6, Rank7) %>% dplyr::mutate(mean_abundance = mean(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::mutate(taxonamy = ifelse(mean_abundance >= sort(unique(mean_abundance), decreasing = TRUE)[10],
                                  paste(Rank6, Rank7, sep = "|"), "others")) %>% 
  dplyr::arrange(dplyr::desc(mean_abundance)) %>% dplyr::mutate(taxonamy = factor(taxonamy, levels = unique(taxonamy))) %>%
  dplyr::group_by(dataset_name, rownames) %>% dplyr::mutate(all_abundance = sum(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::arrange(dataset_name, all_abundance) %>% dplyr::mutate(rownames = factor(rownames, levels = unique(rownames)))
p1 <- tb_long_speciesUnclassified %>% 
  ggplot(aes(x = rownames, y = abundance, fill = taxonamy)) +
  geom_bar(stat = "identity") +
  facet_grid(.~dataset_name, scales = "free_x", space = "free_x") +
  no_label_xaxis() +
  ggtitle("Abundance of OTUs unclassified at species level")
p2 <- tb_long %>% 
  dplyr::filter(classification == "unclassified at species") %>% 
  dplyr::group_by(dataset_name) %>% 
  dplyr::filter(!duplicated(feature)) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(taxonamy = paste(Rank6, Rank7, sep = "|")) %>% 
  dplyr::filter(taxonamy %in% levels(tb_long_speciesUnclassified$taxonamy)) %>% 
  dplyr::mutate(taxonamy = factor(taxonamy, levels = levels(tb_long_speciesUnclassified$taxonamy))) %>% 
  dplyr::group_by(dataset_name, taxonamy) %>% 
  dplyr::summarise(n_otus = n()) %>% 
  ggplot(aes(x = dataset_name, y = n_otus, fill = taxonamy)) +
  geom_bar(stat = "identity") +
  rotate_xaxis(30)
cowplot::plot_grid(p1, p2, nrow = 1, rel_widths = c(0.8, 0.2)) %>% 
  ggsave(paste0(dir_output, "fig_speciesUnclassified.pdf"),
         .,
         width = 25,
         height = 6)

tb_long_generaUnclassified <- tb_long %>% 
  dplyr::filter(classification == "unclassified at genus") %>% 
  dplyr::group_by(dataset_name, rownames, Rank5, Rank6) %>% dplyr::summarise(abundance = sum(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::group_by(Rank5, Rank6) %>% dplyr::mutate(mean_abundance = mean(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::mutate(taxonamy = ifelse(mean_abundance >= sort(unique(mean_abundance), decreasing = TRUE)[10],
                                  paste(Rank5, Rank6, sep = "|"), "others")) %>% 
  dplyr::arrange(dplyr::desc(mean_abundance)) %>% dplyr::mutate(taxonamy = factor(taxonamy, levels = unique(taxonamy))) %>%
  dplyr::group_by(dataset_name, rownames) %>% dplyr::mutate(all_abundance = sum(abundance)) %>% dplyr::ungroup() %>% 
  dplyr::arrange(dataset_name, all_abundance) %>% dplyr::mutate(rownames = factor(rownames, levels = unique(rownames)))
p1 <- tb_long_generaUnclassified %>% 
  ggplot(aes(x = rownames, y = abundance, fill = taxonamy)) +
  geom_bar(stat = "identity") +
  facet_grid(.~dataset_name, scales = "free_x", space = "free_x") +
  no_label_xaxis() +
  ggtitle("Abundance of OTUs unclassified at genera level")
p2 <- tb_long %>% 
  dplyr::filter(classification == "unclassified at genus") %>% 
  dplyr::group_by(dataset_name) %>% 
  dplyr::filter(!duplicated(feature)) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(taxonamy = paste(Rank5, Rank6, sep = "|")) %>% 
  dplyr::filter(taxonamy %in% levels(tb_long_generaUnclassified$taxonamy)) %>% 
  dplyr::mutate(taxonamy = factor(taxonamy, levels = levels(tb_long_generaUnclassified$taxonamy))) %>% 
  dplyr::group_by(dataset_name, taxonamy) %>% 
  dplyr::summarise(n_otus = n()) %>% 
  ggplot(aes(x = dataset_name, y = n_otus, fill = taxonamy)) +
  geom_bar(stat = "identity") +
  rotate_xaxis(30)
p3 <- tb_long %>% 
  dplyr::filter(classification == "unclassified at genus",
                Rank5 == "f__Enterobacteriaceae",
                Rank6 == "g__") %>% 
  dplyr::group_by(dataset_name) %>% 
  dplyr::filter(!duplicated(feature)) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(taxonamy = paste(Rank5, Rank6, sep = "|")) %>% 
  ggplot(aes(x = dataset_name)) +
  geom_bar() +
  rotate_xaxis(30) +
  ggtitle("# unclassified f_Enterobacteriacea OTUs")
cowplot::plot_grid(p1, p2, p3, nrow = 1, rel_widths = c(4, 1, 1)) %>% 
  ggsave(paste0(dir_output, "fig_generaUnclassified.pdf"),
         .,
         width = 30,
         height = 6)

Examine alpha diversity.

tb_nTaxaSampleLibSize <- list(otus = l_otus,
                              species = l_species,
                              genera = l_genera) %>% 
  purrr::imap_dfr(function(l_phylo, feature_level) {
    purrr::imap_dfr(l_phylo, function(phylo, study) {
      tibble::tibble(
        feature_level = feature_level,
        study = study,
        N_taxa = phyloseq::ntaxa(phylo),
        N_samples = phyloseq::nsamples(phylo),
        median_log10LibSize = phylo %>% phyloseq::sample_sums() %>% 
          add(1) %>% log(10) %>% 
          median()
      )
    })
  })
p1 <- tb_nTaxaSampleLibSize %>% 
  ggplot(aes(x = N_samples,
             y = N_taxa)) +
  geom_point() +
  ggrepel::geom_text_repel(aes(label = study)) +
  facet_wrap(~factor(feature_level, levels = c("otus", "species", "genera")),
             scales = "free_y")
p2 <- tb_nTaxaSampleLibSize %>% 
  ggplot(aes(x = median_log10LibSize,
             y = N_taxa)) +
  geom_point() +
  ggrepel::geom_text_repel(aes(label = study)) +
  facet_wrap(~factor(feature_level, levels = c("otus", "species", "genera")),
             scales = "free_y")
cowplot::plot_grid(p1, p2, ncol = 1) %>% 
  cowplot_title("Alpha diversity across studies") %>% 
  ggsave(file = paste0(dir_output, "fig_alphaDiversity.pdf"),
         width = 15,
         heigh = 10)

Examine uniquely present and absent taxa.

list(otus = phylo_otus,
     species = phylo_species,
     genera = phylo_genera) %>% 
  purrr::iwalk(function(phylo, feature_level) {
    tb_long <- phylo %>% 
      phyloseq::transform_sample_counts(tss) %>% 
      phyloseq_to_tb()
    tb_presence <- tb_long %>% 
      dplyr::group_by(feature, dataset_name) %>% 
      dplyr::summarise(present = any(abundance > 0)) %>% 
      dplyr::group_by(feature) %>% 
      dplyr::mutate(n_present = sum(present),
                    n_absent = sum(!present))
    tb_long <- tb_long %>% 
      dplyr::left_join(tb_presence,
                       by = c("feature", "dataset_name"))
    tb_uniqPres <- tb_long %>% 
      dplyr::filter(n_present == 1, present) %>% 
      dplyr::group_by(feature, dataset_name) %>% 
      dplyr::summarise(mean_abundance = mean(abundance)) %>% 
      dplyr::ungroup()
    tb_uniqAbs <- tb_long %>% 
      dplyr::filter(n_absent == 1, present) %>% 
      dplyr::group_by(feature) %>% 
      dplyr::summarise(mean_abundance = mean(abundance)) %>% 
      dplyr::ungroup() %>% 
      dplyr::left_join(
        tb_presence %>% 
          dplyr::filter(n_absent == 1,
                        !present),
        by = "feature"
      ) %>% 
      dplyr::select(feature, dataset_name, mean_abundance)
    
    # Table summarises feature presence
    tb_presence %>% 
      dplyr::filter(!duplicated(feature)) %>% 
      dplyr::group_by(n_present) %>% 
      dplyr::summarise(N = n()) %>% 
      t() %>% 
      DT::datatable(caption = feature_level) %>% 
      print()
    
    # Figure uniquely present features
    p <- tb_uniqPres %>% 
      dplyr::arrange(dataset_name, mean_abundance) %>% 
      dplyr::mutate(feature = factor(feature, levels = feature)) %>% 
      ggplot(aes(x = feature, y = mean_abundance, color = dataset_name)) + geom_point() +
      rotate_xaxis(90) 
    ggsave(filename = paste0(dir_output, "fig_", feature_level, "UniquePresent.pdf"),
           plot = p,
           width = 40,
           height = 10)
    p <- tb_uniqPres %>% 
      dplyr::filter(mean_abundance > 1e-4) %>% 
      dplyr::arrange(dataset_name, mean_abundance) %>% 
      dplyr::mutate(feature = factor(feature, levels = feature)) %>% 
      ggplot(aes(x = feature, y = mean_abundance, color = dataset_name)) + geom_point() +
      rotate_xaxis(90) + coord_flip()
    p %>% print() %>% 
      ggsave(filename = paste0(dir_output, "fig_", feature_level, "UniquePresentZoom.pdf"),
             plot = .,
             width = 15,
             height = 10)
    
    # Figure uniquely absent features
    p <- tb_uniqAbs %>% 
      dplyr::arrange(dataset_name, mean_abundance) %>% 
      dplyr::mutate(feature = factor(feature, levels = feature)) %>% 
      ggplot(aes(x = feature, y = mean_abundance, color = dataset_name)) + geom_point() +
      rotate_xaxis(90) + coord_flip() 
    p %>% print() %>% 
      ggsave(filename = paste0(dir_output, "fig_", feature_level, "UniqueAbsent.pdf"),
             plot = .,
             width = 15,
             height = 10)
  })

## Individual OTUS

mat_tax_tmp <- tax_table2(l_species$Jansson_Lamendella_Crohns)
mat_tax_tmp %>% 
  tibble::as_tibble(rownames = "otu_cluster") %>% 
  dplyr::filter(Rank7 == "s__oncorhynchi")
mat_tax_tmp <- tax_table2(l_species$MucosalIBD)
mat_tax_tmp %>% 
  tibble::as_tibble(rownames = "otu_cluster") %>% 
  dplyr::filter(Rank7 == "s__blattae")
tax_table2(phylo_genera) %>% 
  tibble::as_tibble(rownames = "feature") %>% 
  dplyr::filter(Rank6 == "g__Escherichia")

Examine library sizes

# sanity check
tb_libSize <- list(otu = l_otus,
                   species = l_species, 
                   genus = l_genera) %>% 
  purrr::imap_dfr(function(l_phylo, feature_level) {
    l_phylo %>% purrr::imap_dfr(function(phylo, dataset_name) {
      tibble::tibble(
        sample_accession_16S = phyloseq::sample_names(phylo),
        dataset_name = dataset_name,
        lib_size = phyloseq::sample_sums(phylo),
        feature_level = feature_level)
    })
  })
tb_libSize %>% 
  dplyr::select(dataset_name, sample_accession_16S, feature_level, lib_size) %>% 
  tidyr::spread(key = feature_level, value = lib_size) %>% 
  ggplot(aes(x = genus, y = otu)) + geom_point() + ggtitle("lib size otus vs. genera level")

tb_libSize <- tb_libSize %>% dplyr::filter(feature_level == "genus")
tb_readCounts <- studies %>% 
  purrr::map_dfr(~ paste0(dir_processed, 
                          "processed/", 
                          .x, 
                          "/16s/all_samples_read_counts.tsv") %>% 
                   readr::read_tsv(col_types = "cddd") %>% 
                   dplyr::mutate(dataset_name = .x)) %>% 
  dplyr::rename(original_read_count = `original read count`,
                reads_mapping_to_OTU_with_taxonomy = `reads mapping to OTU with taxonomy`,
                reads_mapping_to_unclassifed_OTU = `reads mapping to unclassifed OTU`)
tb_libSize <- tb_libSize %>% 
  dplyr::left_join(tb_readCounts,
                   c("dataset_name" = "dataset_name",
                     "sample_accession_16S" = "# sample"))
tb_libSize %>% 
  ggplot(aes(x = lib_size, y = reads_mapping_to_OTU_with_taxonomy)) + geom_point() +
  facet_wrap(~dataset_name) + ggtitle("The two should be the same")# the two are equal

tb_libSize %>% 
  tidyr::gather(key = "Type of read count",
                value = "N",
                original_read_count,
                lib_size,
                reads_mapping_to_OTU_with_taxonomy,
                reads_mapping_to_unclassifed_OTU,
                factor_key = TRUE) %>% 
  ggplot(aes(x = dataset_name, y = log10(N + 1), color = `Type of read count`)) + 
  geom_boxplot() + rotate_xaxis(90)

ggsave(filename = paste0(dir_output, "fig_libSize.pdf"),
       width = 10,
       height = 8)
p1 <- tb_libSize %>% 
  ggplot(aes(x = dataset_name, fill = (lib_size > 3000))) +
  geom_bar(stat = "count") +
  rotate_xaxis(30) +
  ggtitle("Cutoff = 3000")
p2 <- tb_libSize %>% 
  ggplot(aes(x = dataset_name, fill = (lib_size > 1000))) +
  geom_bar(stat = "count") +
  rotate_xaxis(30) +
  ggtitle("Cutoff = 1000")
cowplot::plot_grid(p1, p2, ncol = 1) %>% 
  cowplot_title(title = "Lib size filtering cut off?") %>% print() %>%
  ggsave(filename = paste0(dir_output, "fig_libSizeCutoff.pdf"),
         width = 10,
         height = 8)

sample_data(phylo_species) <- 
  sample_data2(phylo_species) %>% 
  tibble::rownames_to_column() %>% 
  dplyr::left_join(tb_readCounts, 
                   by = c("dataset_name" = "dataset_name", 
                          "sample_accession_16S" = "# sample")) %>% 
  tibble::column_to_rownames()
sample_data(phylo_genera) <- 
  sample_data2(phylo_genera) %>%
  tibble::rownames_to_column() %>% 
  dplyr::left_join(tb_readCounts, 
                   by = c("dataset_name" = "dataset_name", 
                          "sample_accession_16S" = "# sample")) %>% 
  tibble::column_to_rownames()

Filter samples and taxa

print(phylo_genera)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1149 taxa and 6282 samples ]
## sample_data() Sample Data:       [ 6282 samples by 55 sample variables ]
## tax_table()   Taxonomy Table:    [ 1149 taxa by 8 taxonomic ranks ]
phylo_genera <- phylo_genera %>% phyloseq::subset_samples(!is.na(body_site) & !is.na(disease))
print(phylo_genera)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1149 taxa and 5248 samples ]
## sample_data() Sample Data:       [ 5248 samples by 55 sample variables ]
## tax_table()   Taxonomy Table:    [ 1149 taxa by 8 taxonomic ranks ]
phylo_genera <- phylo_genera %>% 
  prune_taxaSamples(flist_taxa = kOverA2(k = phyloseq::nsamples(phylo_genera) * 0.01,
                                         A = 1e-4),
                    flist_samples = function(x) sum(x) > 3000)
print(phylo_genera)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 541 taxa and 4872 samples ]
## sample_data() Sample Data:       [ 4872 samples by 55 sample variables ]
## tax_table()   Taxonomy Table:    [ 541 taxa by 8 taxonomic ranks ]
print(phylo_species)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1391 taxa and 6282 samples ]
## sample_data() Sample Data:       [ 6282 samples by 55 sample variables ]
## tax_table()   Taxonomy Table:    [ 1391 taxa by 8 taxonomic ranks ]
phylo_species <- phylo_species %>% 
  phyloseq::prune_samples(phyloseq::sample_names(phylo_genera), .) %>% 
  phyloseq::filter_taxa(kOverA2(k = phyloseq::nsamples(phylo_genera) * 0.01,
                                         A = 1e-4),
                        prune = TRUE)
print(phylo_species)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 661 taxa and 4872 samples ]
## sample_data() Sample Data:       [ 4872 samples by 55 sample variables ]
## tax_table()   Taxonomy Table:    [ 661 taxa by 8 taxonomic ranks ]
print(phylo_otus)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9259 taxa and 6282 samples ]
## sample_data() Sample Data:       [ 6282 samples by 52 sample variables ]
## tax_table()   Taxonomy Table:    [ 9259 taxa by 8 taxonomic ranks ]
phylo_otus <- phylo_otus %>% 
  phyloseq::prune_samples(phyloseq::sample_names(phylo_genera), .) %>% 
  phyloseq::filter_taxa(kOverA2(k = phyloseq::nsamples(phylo_genera) * 0.01,
                                A = 1e-4),
                        prune = TRUE)
print(phylo_otus)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 4073 taxa and 4872 samples ]
## sample_data() Sample Data:       [ 4872 samples by 52 sample variables ]
## tax_table()   Taxonomy Table:    [ 4073 taxa by 8 taxonomic ranks ]
save(phylo_genera, file = "data/phyloseq/genera.RData")
save(phylo_species, file = "data/phyloseq/species.RData")
save(phylo_otus, file = "data/phyloseq/otus.RData")